1 Introduction

This report presents a comprehensive analysis of the Steam games dataset. The analysis includes data description, preprocessing, descriptive statistics, distribution fitting, hypothesis testing, ANOVA, and regression analysis. The goal is to gain insights into the factors that influence game success on the Steam platform.

2 Required Packages

# Packages this analysis depends on; any that are missing get installed below
required_packages <- c(
  "tidyverse", "ggplot2", "dplyr", "tidyr", "lubridate", "stringr",
  "moments", "car", "fitdistrplus", "corrplot", "knitr", "kableExtra",
  "gridExtra", "scales", "RColorBrewer", "plotly", "viridis", "htmlwidgets"
)

# Attach a package, installing it first when it is not available.
#
# pkg: name of the package, as a single character string.
# Returns NULL invisibly; the function is called for its side effects
# (optional installation, then attaching the package to the search path).
check_and_install <- function(pkg) {
  # requireNamespace() only probes for availability and stays quiet when the
  # package is absent, unlike require(), which warns and doubles as a loader.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  # library() errors loudly if the package still cannot be attached.
  library(pkg, character.only = TRUE)
  invisible(NULL)
}

# Install (when needed) and attach every required package; the list of
# per-package results is discarded
invisible(lapply(required_packages, check_and_install))

3 Data Loading and Initial Exploration

# Load the dataset.
# The CSV location can be overridden through the STEAM_CSV_PATH environment
# variable so the report can be rendered on other machines; when the variable
# is unset or empty, the original local path is used.
file_path <- Sys.getenv("STEAM_CSV_PATH", unset = "")
if (!nzchar(file_path)) {
  file_path <- "C:/Users/Surya/Desktop/New folder/steam.csv"
}
# Fail early with a message that names the path that was tried
if (!file.exists(file_path)) {
  stop("Steam dataset not found at: ", file_path, call. = FALSE)
}
steam_data <- read.csv(file_path, stringsAsFactors = FALSE)

# Dataset dimensions
cat(sprintf("Dataset dimensions: %d rows and %d columns\n",
            nrow(steam_data), ncol(steam_data)))
## Dataset dimensions: 27075 rows and 18 columns

3.1 First Few Rows

# Render the first five rows as a styled, horizontally scrollable table
scroll_box(
  kable_styling(
    kable(head(steam_data, 5)),
    bootstrap_options = c("striped", "hover", "condensed", "responsive"),
    font_size = 11
  ),
  width = "100%"
)
appid name release_date english developer publisher platforms required_age categories genres steamspy_tags achievements positive_ratings negative_ratings average_playtime median_playtime owners price
10 Counter-Strike 2000-11-01 1 Valve Valve windows;mac;linux 0 Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled Action Action;FPS;Multiplayer 0 124534 3339 17612 317 10000000-20000000 7.19
20 Team Fortress Classic 1999-04-01 1 Valve Valve windows;mac;linux 0 Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled Action Action;FPS;Multiplayer 0 3318 633 277 62 5000000-10000000 3.99
30 Day of Defeat 2003-05-01 1 Valve Valve windows;mac;linux 0 Multi-player;Valve Anti-Cheat enabled Action FPS;World War II;Multiplayer 0 3416 398 187 34 5000000-10000000 3.99
40 Deathmatch Classic 2001-06-01 1 Valve Valve windows;mac;linux 0 Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled Action Action;FPS;Multiplayer 0 1273 267 258 184 5000000-10000000 3.99
50 Half-Life: Opposing Force 1999-11-01 1 Gearbox Software Valve windows;mac;linux 0 Single-player;Multi-player;Valve Anti-Cheat enabled Action FPS;Action;Sci-fi 0 5250 288 624 415 5000000-10000000 3.99

3.2 Structure of the Dataset

# Print the compact structure of the raw data: one line per column showing
# its type and first few values
str(steam_data)
## 'data.frame':    27075 obs. of  18 variables:
##  $ appid           : int  10 20 30 40 50 60 70 80 130 220 ...
##  $ name            : chr  "Counter-Strike" "Team Fortress Classic" "Day of Defeat" "Deathmatch Classic" ...
##  $ release_date    : chr  "2000-11-01" "1999-04-01" "2003-05-01" "2001-06-01" ...
##  $ english         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ developer       : chr  "Valve" "Valve" "Valve" "Valve" ...
##  $ publisher       : chr  "Valve" "Valve" "Valve" "Valve" ...
##  $ platforms       : chr  "windows;mac;linux" "windows;mac;linux" "windows;mac;linux" "windows;mac;linux" ...
##  $ required_age    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ categories      : chr  "Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled" "Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled" "Multi-player;Valve Anti-Cheat enabled" "Multi-player;Online Multi-Player;Local Multi-Player;Valve Anti-Cheat enabled" ...
##  $ genres          : chr  "Action" "Action" "Action" "Action" ...
##  $ steamspy_tags   : chr  "Action;FPS;Multiplayer" "Action;FPS;Multiplayer" "FPS;World War II;Multiplayer" "Action;FPS;Multiplayer" ...
##  $ achievements    : int  0 0 0 0 0 0 0 0 0 33 ...
##  $ positive_ratings: int  124534 3318 3416 1273 5250 2758 27755 12120 3822 67902 ...
##  $ negative_ratings: int  3339 633 398 267 288 684 1100 1439 420 2419 ...
##  $ average_playtime: int  17612 277 187 258 624 175 1300 427 361 691 ...
##  $ median_playtime : int  317 62 34 184 415 10 83 43 205 402 ...
##  $ owners          : chr  "10000000-20000000" "5000000-10000000" "5000000-10000000" "5000000-10000000" ...
##  $ price           : num  7.19 3.99 3.99 3.99 3.99 3.99 7.19 7.19 3.99 7.19 ...

3.3 Summary Statistics

# Per-column summary of the raw data (quantiles for numeric columns,
# length/class/mode for character columns), stored and then printed
summary_stats <- summary(steam_data)
print(summary_stats)
##      appid             name           release_date          english      
##  Min.   :     10   Length:27075       Length:27075       Min.   :0.0000  
##  1st Qu.: 401230   Class :character   Class :character   1st Qu.:1.0000  
##  Median : 599070   Mode  :character   Mode  :character   Median :1.0000  
##  Mean   : 596204                                         Mean   :0.9811  
##  3rd Qu.: 798760                                         3rd Qu.:1.0000  
##  Max.   :1069460                                         Max.   :1.0000  
##   developer          publisher          platforms          required_age    
##  Length:27075       Length:27075       Length:27075       Min.   : 0.0000  
##  Class :character   Class :character   Class :character   1st Qu.: 0.0000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 0.0000  
##                                                           Mean   : 0.3549  
##                                                           3rd Qu.: 0.0000  
##                                                           Max.   :18.0000  
##   categories           genres          steamspy_tags       achievements    
##  Length:27075       Length:27075       Length:27075       Min.   :   0.00  
##  Class :character   Class :character   Class :character   1st Qu.:   0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :   7.00  
##                                                           Mean   :  45.25  
##                                                           3rd Qu.:  23.00  
##                                                           Max.   :9821.00  
##  positive_ratings  negative_ratings average_playtime   median_playtime   
##  Min.   :      0   Min.   :     0   Min.   :     0.0   Min.   :     0.0  
##  1st Qu.:      6   1st Qu.:     2   1st Qu.:     0.0   1st Qu.:     0.0  
##  Median :     24   Median :     9   Median :     0.0   Median :     0.0  
##  Mean   :   1001   Mean   :   211   Mean   :   149.8   Mean   :   146.1  
##  3rd Qu.:    126   3rd Qu.:    42   3rd Qu.:     0.0   3rd Qu.:     0.0  
##  Max.   :2644404   Max.   :487076   Max.   :190625.0   Max.   :190625.0  
##     owners              price        
##  Length:27075       Min.   :  0.000  
##  Class :character   1st Qu.:  1.690  
##  Mode  :character   Median :  3.990  
##                     Mean   :  6.078  
##                     3rd Qu.:  7.190  
##                     Max.   :421.990

4 Data Cleaning and Preprocessing

4.1 Missing Values

# Count NA entries per column, then tabulate only the columns that have any
missing_values <- vapply(
  steam_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
missing_df <- data.frame(
  Column = names(missing_values),
  Missing_Values = missing_values
)
kable_styling(
  kable(filter(missing_df, Missing_Values > 0)),
  bootstrap_options = c("striped", "hover")
)
Column Missing_Values
NA NA
:—— ————–:

4.2 Handling Missing Values

# Impute missing values column by column.
#
# df: a data frame.
# Returns df with NA entries replaced: numeric columns get the column median
# (computed without the NAs), character columns get the literal "Unknown".
# Columns of any other type are returned unchanged.
handle_missing_values <- function(df) {
  for (col_name in names(df)) {
    values <- df[[col_name]]
    na_mask <- is.na(values)

    # Nothing to impute in this column
    if (!any(na_mask)) {
      next
    }

    if (is.numeric(values)) {
      values[na_mask] <- median(values, na.rm = TRUE)
    } else if (is.character(values)) {
      values[na_mask] <- "Unknown"
    } else {
      next
    }

    df[[col_name]] <- values
  }

  df
}

# Build the working copy of the data with missing values imputed
# (see handle_missing_values: median for numeric columns, "Unknown" for
# character columns)
steam_clean <- handle_missing_values(steam_data)

# Sanity check: count the NA entries left in each column and report the total
remaining_missing <- colSums(is.na(steam_clean))
cat("Remaining missing values after handling:", sum(remaining_missing))
## Remaining missing values after handling: 0

4.3 Processing Release Dates

# Parse release_date ("YYYY-MM-DD" strings) into Date values and derive the
# release year from them
steam_clean$release_date <- as.Date(
  steam_clean$release_date,
  format = "%Y-%m-%d",
  optional = TRUE
)
steam_clean$release_year <- year(steam_clean$release_date)

# Dates that failed to parse leave an NA year; fill those with the median year
median_year <- median(steam_clean$release_year, na.rm = TRUE)
steam_clean$release_year <- replace(
  steam_clean$release_year,
  is.na(steam_clean$release_year),
  median_year
)

# Number of games released in each year
year_counts <- table(steam_clean$release_year)
year_df <- data.frame(
  Year = as.numeric(names(year_counts)),
  Count = as.numeric(year_counts)
)

# Interactive bar chart of the ten years with the most releases,
# ordered from the busiest year downwards
year_plot <- layout(
  plot_ly(
    head(arrange(year_df, desc(Count)), 10),
    x = ~reorder(Year, -Count),
    y = ~Count,
    type = "bar",
    marker = list(
      color = colorRampPalette(c("lightblue", "steelblue", "darkblue"))(10),
      line = list(color = "rgb(8,48,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Year:", Year, "<br>Number of Games:", Count)
  ),
  title = "Top 10 Years by Game Count",
  xaxis = list(title = "Release Year", tickangle = 45),
  yaxis = list(title = "Number of Games"),
  hoverlabel = list(bgcolor = "white")
)

# Display the plotly plot
year_plot
# INSIGHTS:
# The interactive bar chart clearly shows 2018 as the year with the highest number of game releases on Steam,
# followed by 2017. The apparent drop in releases after 2018 is an artifact of the dataset, which only
# covers releases up to 1 May 2019.

4.4 Processing Ownership Data

# Convert owner ranges such as "10000-20000" into the midpoint of the range.
#
# owners_range: character vector of "lower-upper" ranges; thousands separators
#   (",") and surrounding whitespace in either bound are ignored.
# Returns an unnamed numeric vector of the same length holding
# (lower + upper) / 2. Elements that are NA, or that lack either bound,
# yield NA_real_.
extract_owners_mean <- function(owners_range) {
  bounds <- strsplit(as.character(owners_range), "-", fixed = TRUE)

  # Pull the idx-th piece of every range and parse it as a number; a missing
  # piece indexes to NA and therefore parses to NA
  bound_as_number <- function(idx) {
    raw <- vapply(
      bounds,
      function(parts) parts[idx],
      character(1),
      USE.NAMES = FALSE
    )
    as.numeric(gsub(",", "", trimws(raw), fixed = TRUE))
  }

  (bound_as_number(1L) + bound_as_number(2L)) / 2
}

# Midpoint of each game's owner range. USE.NAMES = FALSE keeps the column a
# plain numeric vector; without it every element would carry its owners string
# as a name. vapply() also guarantees exactly one number per row.
steam_clean$owners_mean <- vapply(
  steam_clean$owners,
  extract_owners_mean,
  numeric(1),
  USE.NAMES = FALSE
)

# Ownership statistics
summary_owners <- summary(steam_clean$owners_mean)
summary_owners
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     10000     10000     10000    134090     35000 150000000

4.5 Creating Price Categories

# Bin prices into labelled tiers. cut() uses right-closed intervals by
# default, so a price of 0 falls in the first bin, (-0.01, 0.01]
steam_clean$price_category <- cut(
  steam_clean$price,
  breaks = c(-0.01, 0.01, 10, 20, 30, 100, 1000),
  labels = c("Free", "Budget", "Mid-range", "Premium", "AAA", "Collector")
)

# Number of games in each price tier
price_cat_counts <- table(steam_clean$price_category)
price_cat_df <- data.frame(
  Category = names(price_cat_counts),
  Count = as.numeric(price_cat_counts)
)

# Interactive bar chart of the tiers, largest first; the hover text shows the
# count and the tier's share of all games
layout(
  plot_ly(
    price_cat_df,
    x = ~reorder(Category, -Count),
    y = ~Count,
    type = "bar",
    marker = list(
      color = plasma(6, alpha = 0.8),
      line = list(color = "rgb(58,54,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Category:", Category, "<br>Number of Games:", Count,
                       "<br>Percentage:", round(Count/sum(Count)*100, 1), "%")
  ),
  title = list(text = "Distribution of Price Categories", font = list(size = 16)),
  xaxis = list(title = "Price Category", tickangle = 45),
  yaxis = list(title = "Number of Games"),
  hoverlabel = list(bgcolor = "white")
)
# INSIGHTS:
# The price distribution visualization reveals that budget-priced games (under $10) dominate the Steam 
# marketplace, followed by free games. Premium-priced and AAA-priced games are much less common, indicating
# that most developers target the lower price tiers. The hover information shows both counts and percentages.

4.6 Calculating Ratings Ratio

# Total number of ratings per game and the share of them that are positive
steam_clean$total_ratings <- with(steam_clean, positive_ratings + negative_ratings)
steam_clean$positive_ratio <- with(steam_clean, positive_ratings / total_ratings)

# Games without any ratings produce 0 / 0 = NaN; record their ratio as 0
steam_clean$positive_ratio <- replace(
  steam_clean$positive_ratio,
  is.na(steam_clean$positive_ratio),
  0
)

# Positive ratings ratio statistics
summary(steam_clean$positive_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.5833  0.7603  0.7145  0.8939  1.0000
# Interactive 50-bin histogram of the positive ratings ratio; hovering a bar
# shows its count
layout(
  plot_ly(
    steam_clean,
    x = ~positive_ratio,
    type = "histogram",
    nbinsx = 50,
    marker = list(
      color = "rgba(70, 130, 180, 0.7)",
      line = list(color = "rgba(8, 48, 107, 1)", width = 1)
    ),
    hoverinfo = "y"
  ),
  title = list(text = "Distribution of Positive Ratings Ratio", font = list(size = 16)),
  xaxis = list(title = "Positive Ratings / Total Ratings", range = c(0, 1)),
  yaxis = list(title = "Number of Games"),
  bargap = 0.1
)
# INSIGHTS:
# The ratings distribution is left-skewed (negatively skewed): most games have a positive ratings ratio
# above 0.7, with a long tail toward lower ratios. This suggests that Steam users generally rate games
# positively or that lower-rated games may be removed from the platform. There's a noticeable peak near
# 1.0, indicating many games with very high approval ratings.

4.7 Processing Platform Information

# One 0/1 indicator column per platform: 1 when the platform name occurs in
# the lower-cased platforms string
steam_clean[c("has_windows", "has_mac", "has_linux")] <- lapply(
  c("windows", "mac", "linux"),
  function(platform_name) {
    as.integer(grepl(platform_name, tolower(steam_clean$platforms)))
  }
)

# Number of games available on each platform
platform_counts <- setNames(
  c(
    sum(steam_clean$has_windows),
    sum(steam_clean$has_mac),
    sum(steam_clean$has_linux)
  ),
  c("Windows", "Mac", "Linux")
)

platform_df <- data.frame(
  Platform = names(platform_counts),
  Count = platform_counts
)

# Create interactive plot for platform distribution.
# The hover percentage is each platform's share of all games in the dataset
# (denominator: nrow(steam_clean)), which is what the "Percentage of Total"
# label promises; dividing by max(Count) would instead report the share
# relative to the most-supported platform.
plot_ly(
  platform_df, 
  x = ~reorder(Platform, -Count), 
  y = ~Count,
  type = "bar",
  marker = list(
    color = c("#2C7BB6", "#D7191C", "#FDAE61"),
    line = list(color = "rgb(8,48,107)", width = 1.5)
  ),
  hoverinfo = "text",
  hovertext = ~paste("Platform:", Platform, "<br>Number of Games:", Count, 
                     "<br>Percentage of Total:",
                     round(Count / nrow(steam_clean) * 100, 1), "%")
) %>%
  layout(
    title = list(text = "Games Available by Platform", font = list(size = 16)),
    xaxis = list(title = "Platform"),
    yaxis = list(title = "Number of Games"),
    hoverlabel = list(bgcolor = "white")
  )
# INSIGHTS:
# Windows dominates the platform availability with nearly all Steam games supporting it.
# Mac support is available for roughly half as many games, while Linux has the least support.
# This confirms Windows' continued dominance in PC gaming. The interactive chart shows the
# percentage of all games in the dataset that support each platform.

# Number of platforms each game supports (sum of the three 0/1 indicators)
steam_clean$platform_count <- with(steam_clean, has_windows + has_mac + has_linux)

# How many games support 1, 2 or 3 platforms
platform_count_table <- table(steam_clean$platform_count)
platform_count_df <- data.frame(
  Number_of_Platforms = as.numeric(names(platform_count_table)),
  Count = as.numeric(platform_count_table)
)

# Interactive bar chart of the platform-count distribution; the hover text
# shows the count and its share of all games
layout(
  plot_ly(
    platform_count_df,
    x = ~as.factor(Number_of_Platforms),
    y = ~Count,
    type = "bar",
    marker = list(
      color = viridis(3, alpha = 0.8),
      line = list(color = "rgb(8,48,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Platforms Supported:", Number_of_Platforms, 
                       "<br>Number of Games:", Count,
                       "<br>Percentage:", round(Count/sum(Count)*100, 1), "%")
  ),
  title = list(text = "Number of Supported Platforms per Game", font = list(size = 16)),
  xaxis = list(title = "Number of Platforms"),
  yaxis = list(title = "Number of Games"),
  hoverlabel = list(bgcolor = "white")
)
# INSIGHTS:
# This visualization shows that while many games are Windows-only (supporting just one platform),
# a significant number of games support all three platforms (Windows, Mac, and Linux). Very few games
# support exactly two platforms, suggesting developers either focus solely on Windows or aim for
# full cross-platform compatibility.

4.8 Processing Categories and Genres

# Count the most frequent values in a semicolon-delimited column.
#
# column: name of a character column of the global `steam_clean` data frame
#   whose entries are ";"-separated lists (e.g. "Action;Indie").
# n: maximum number of entries to return (default 10).
# Returns a one-dimensional table of counts, sorted in decreasing order and
# named by value. At most n entries are returned; when the column holds fewer
# than n distinct values, only those are returned (no NA padding).
get_top_categories <- function(column, n = 10) {
  values <- steam_clean[[column]]
  values <- values[!is.na(values)]

  all_categories <- trimws(unlist(strsplit(values, ";", fixed = TRUE)))
  ranked <- sort(table(all_categories), decreasing = TRUE)

  # Indexing with 1:n would pad the result with NA entries whenever fewer
  # than n distinct values exist, so cap the index at the available length
  ranked[seq_len(min(n, length(ranked)))]
}

# Ten most frequent entries of the categories column
top_categories <- get_top_categories("categories", 10)
cat_df <- data.frame(
  Category = names(top_categories),
  Count = as.numeric(top_categories)
)

# Interactive bar chart of the top categories, most frequent first
cat_plot <- layout(
  plot_ly(
    cat_df,
    x = ~reorder(Category, -Count),
    y = ~Count,
    type = "bar",
    marker = list(
      color = viridis(10, alpha = 0.8),
      line = list(color = "rgb(8,48,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Category:", Category, "<br>Count:", Count)
  ),
  title = list(text = "Top 10 Game Categories", font = list(size = 16)),
  xaxis = list(title = "Category", tickangle = 45),
  yaxis = list(title = "Count"),
  hoverlabel = list(bgcolor = "white")
)

cat_plot
# Ten most frequent entries of the genres column
top_genres <- get_top_categories("genres", 10)
genre_df <- data.frame(
  Genre = names(top_genres),
  Count = as.numeric(top_genres)
)

# Interactive bar chart of the top genres, most frequent first
genre_plot <- layout(
  plot_ly(
    genre_df,
    x = ~reorder(Genre, -Count),
    y = ~Count,
    type = "bar",
    marker = list(
      color = plasma(10, alpha = 0.8),
      line = list(color = "rgb(58,54,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Genre:", Genre, "<br>Count:", Count)
  ),
  title = list(text = "Top 10 Game Genres", font = list(size = 16)),
  xaxis = list(title = "Genre", tickangle = 45),
  yaxis = list(title = "Count"),
  hoverlabel = list(bgcolor = "white")
)

genre_plot
# Create binary (0/1) indicator columns, named genre_<Genre>, for up to the
# five most common genres.
# Each game's genre string is split on ";" and trimmed once up front, and a
# genre only counts when it equals a whole token. This matches how the genres
# were counted in get_top_categories() and avoids substring hits that a plain
# grepl() on the raw string would allow.
genre_tokens <- lapply(
  strsplit(as.character(steam_clean$genres), ";", fixed = TRUE),
  trimws
)
for (genre in head(names(top_genres), 5)) {
  # Skip placeholder entries that carry no genre name
  if (is.na(genre)) {
    next
  }
  genre_col_name <- paste0("genre_", genre)
  steam_clean[[genre_col_name]] <- as.integer(
    vapply(genre_tokens, function(tokens) genre %in% tokens, logical(1))
  )
}

# INSIGHTS:
# The interactive bar charts reveal that "Single-player" is by far the most common game category,
# followed by "Steam Achievements" and "Steam Trading Cards". For genres, "Indie" dominates the
# Steam platform, followed by "Action" and "Casual". This shows the significant presence of
# independent developers on Steam and the popularity of action-oriented gameplay.

5 Descriptive Statistics and Visualizations

5.1 Games by Release Year

# Games released per year, restricted to the years 1990 through 2024
year_data <- data.frame(table(steam_clean$release_year))
names(year_data) <- c("Year", "Count")
year_data$Year <- as.numeric(as.character(year_data$Year))
year_data <- year_data[with(year_data, Year >= 1990 & Year <= 2024), ]

# Marker colours: one shade per year, indexed by the rank of that year's
# count so that years with more releases get darker markers
year_data_filtered <- year_data
color_scale <- colorRampPalette(
  c("lightblue", "steelblue", "darkblue")
)(nrow(year_data_filtered))
colors_by_year <- color_scale[rank(year_data_filtered$Count)]

# Interactive line chart of releases per year
layout(
  plot_ly(
    year_data_filtered,
    x = ~Year,
    y = ~Count,
    type = "scatter",
    mode = "lines+markers",
    line = list(color = "steelblue", width = 2),
    marker = list(color = colors_by_year, size = 8),
    hoverinfo = "text",
    hovertext = ~paste("Year:", Year, "<br>Number of Games:", Count)
  ),
  title = list(text = "Number of Games Released by Year", font = list(size = 16)),
  xaxis = list(title = "Release Year", tickmode = "array", tickvals = seq(1990, 2024, by = 5)),
  yaxis = list(title = "Number of Games"),
  hoverlabel = list(bgcolor = "white")
)
# INSIGHTS:
# The timeline reveals a dramatic acceleration in game releases starting around 2012, with an exponential
# increase through 2018. This corresponds to Steam's growth and the rise of indie game development.
# The visualization shows how the platform evolved from having few releases in the early years to
# becoming a major publishing platform by the mid-2010s.

5.2 Price Distribution

# Interactive 50-bin histogram of game prices; the x axis is limited to
# 0-100, so higher prices fall outside the visible range
layout(
  plot_ly(
    steam_clean,
    x = ~price,
    type = "histogram",
    nbinsx = 50,
    marker = list(
      color = "rgba(70, 130, 180, 0.7)",
      line = list(color = "rgba(8, 48, 107, 1)", width = 1)
    ),
    hoverinfo = "y"
  ),
  title = list(text = "Distribution of Game Prices", font = list(size = 16)),
  xaxis = list(title = "Price (USD)", range = c(0, 100)),
  yaxis = list(title = "Number of Games"),
  bargap = 0.1
)
# Same price histogram, but with the count (y) axis on a log scale so that
# sparsely populated price bins remain visible
layout(
  plot_ly(
    steam_clean,
    x = ~price,
    type = "histogram",
    nbinsx = 50,
    marker = list(
      color = "rgba(70, 130, 180, 0.7)",
      line = list(color = "rgba(8, 48, 107, 1)", width = 1)
    ),
    hoverinfo = "y"
  ),
  title = list(text = "Distribution of Game Prices (Log Scale)", font = list(size = 16)),
  xaxis = list(title = "Price (USD)", range = c(0, 100)),
  yaxis = list(title = "Number of Games", type = "log"),
  bargap = 0.1
)
# INSIGHTS:
# The price distribution is heavily skewed, with most games priced below $20. There's a significant
# concentration at certain price points (e.g., $0, $9.99, $19.99), suggesting strategic pricing by developers
# at these psychological price points. Very few games are priced above $60, which aligns with traditional
# AAA game pricing limits.

5.3 Relationship Between Price and Ratings

# Draw a reproducible random sample of at most 5000 games so the scatterplot
# stays responsive
set.seed(123)
price_rating_sample <- steam_clean[
  sample(nrow(steam_clean), min(5000, nrow(steam_clean))),
]

# Interactive scatterplot of price against positive ratings ratio; markers
# are coloured by price and the hover text names the game
layout(
  plot_ly(
    price_rating_sample,
    x = ~price,
    y = ~positive_ratio,
    type = "scatter",
    mode = "markers",
    marker = list(
      color = ~price,
      colorscale = "Viridis",
      size = 7,
      opacity = 0.7,
      showscale = TRUE,
      colorbar = list(title = "Price ($)")
    ),
    hoverinfo = "text",
    hovertext = ~paste("Price: $", price, "<br>Positive Ratio:", round(positive_ratio, 2),
                       "<br>Name:", name)
  ),
  title = list(text = "Relationship between Price and Positive Ratings Ratio", font = list(size = 16)),
  xaxis = list(title = "Price (USD)", range = c(0, 100)),
  yaxis = list(title = "Positive Ratings Ratio"),
  hoverlabel = list(bgcolor = "white")
)
# Median positive ratio of each price category, named by category.
# Computed once up front so the hover text can look up the median of the
# category each row belongs to. (Comparing price_category with itself inside
# the formula is always TRUE and would yield the overall median everywhere.)
category_medians <- tapply(
  steam_clean$positive_ratio,
  steam_clean$price_category,
  median,
  na.rm = TRUE
)

# Create interactive boxplot for ratings by price category
plot_ly(
  steam_clean,
  y = ~positive_ratio,
  color = ~price_category,
  type = "box",
  colors = viridis(6),
  hoverinfo = "text",
  hovertext = ~paste("Category:", price_category, 
                     "<br>Median Positive Ratio:", 
                     round(category_medians[as.character(price_category)], 2))
) %>%
  layout(
    title = list(text = "Positive Ratings Ratio by Price Category", font = list(size = 16)),
    xaxis = list(title = "Price Category"),
    yaxis = list(title = "Positive Ratings Ratio"),
    hoverlabel = list(bgcolor = "white"),
    showlegend = FALSE
  )
# INSIGHTS:
# The scatterplot reveals a slight positive correlation between price and ratings, with higher-priced
# games generally receiving better ratings. However, there's considerable variability, especially
# in the $10-30 range. The boxplot further confirms that premium and AAA-priced games tend to have
# higher median positive ratings than free or budget titles, suggesting higher-priced games generally
# deliver better quality experiences.

5.4 Average Playtime Analysis

# Mean average_playtime of the games flagged with each of the top five genres
# (uses the genre_<Genre> indicator columns), sorted from longest to shortest
top_5_genres <- names(top_genres)[1:5]
genre_time <- data.frame(
  Genre = top_5_genres,
  AveragePlaytime = vapply(
    top_5_genres,
    function(genre_name) {
      in_genre <- steam_clean[[paste0("genre_", genre_name)]] == 1
      mean(steam_clean$average_playtime[in_genre], na.rm = TRUE)
    },
    numeric(1),
    USE.NAMES = FALSE
  )
)

genre_time <- genre_time[order(-genre_time$AveragePlaytime), ]

# Interactive horizontal bar chart; the hover text shows minutes and hours
layout(
  plot_ly(
    genre_time,
    y = ~reorder(Genre, AveragePlaytime),
    x = ~AveragePlaytime,
    type = "bar",
    orientation = "h",
    marker = list(
      color = viridis(nrow(genre_time), alpha = 0.8),
      line = list(color = "rgb(8,48,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Genre:", Genre, "<br>Average Playtime:", round(AveragePlaytime, 1), "minutes",
                       "<br>(", round(AveragePlaytime/60, 1), "hours)")
  ),
  title = list(text = "Average Playtime by Genre", font = list(size = 16)),
  yaxis = list(title = ""),
  xaxis = list(title = "Average Playtime (minutes)"),
  hoverlabel = list(bgcolor = "white")
)
# Mean average_playtime per price category, longest first
price_time <- arrange(
  summarize(
    group_by(steam_clean, price_category),
    AveragePlaytime = mean(average_playtime, na.rm = TRUE)
  ),
  desc(AveragePlaytime)
)

# Interactive horizontal bar chart; the hover text shows minutes and hours
layout(
  plot_ly(
    price_time,
    y = ~reorder(price_category, AveragePlaytime),
    x = ~AveragePlaytime,
    type = "bar",
    orientation = "h",
    marker = list(
      color = plasma(nrow(price_time), alpha = 0.8),
      line = list(color = "rgb(58,54,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Price Category:", price_category, 
                       "<br>Average Playtime:", round(AveragePlaytime, 1), "minutes",
                       "<br>(", round(AveragePlaytime/60, 1), "hours)")
  ),
  title = list(text = "Average Playtime by Price Category", font = list(size = 16)),
  yaxis = list(title = ""),
  xaxis = list(title = "Average Playtime (minutes)"),
  hoverlabel = list(bgcolor = "white")
)
# INSIGHTS:
# The genre analysis reveals which types of games tend to be most engaging for players, with certain genres
# showing significantly higher average playtimes. The price category visualization demonstrates that
# higher-priced games generally maintain player engagement for longer periods, likely due to more content
# or depth of gameplay.

5.5 Platform Analysis

# Mean average_playtime and number of games for each platform count
platform_time <- summarize(
  group_by(steam_clean, platform_count),
  AveragePlaytime = mean(average_playtime, na.rm = TRUE),
  Count = n()
)

# Interactive bar chart of playtime against number of supported platforms
layout(
  plot_ly(
    platform_time,
    x = ~as.factor(platform_count),
    y = ~AveragePlaytime,
    type = "bar",
    marker = list(
      color = viridis(nrow(platform_time), alpha = 0.8),
      line = list(color = "rgb(8,48,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Platforms:", platform_count, 
                       "<br>Average Playtime:", round(AveragePlaytime, 1), "minutes",
                       "<br>(", round(AveragePlaytime/60, 1), "hours)",
                       "<br>Number of Games:", Count)
  ),
  title = list(text = "Average Playtime by Platform Count", font = list(size = 16)),
  xaxis = list(title = "Number of Platforms"),
  yaxis = list(title = "Average Playtime (minutes)"),
  hoverlabel = list(bgcolor = "white")
)
# Mean positive ratings ratio and number of games for each platform count
platform_ratings <- summarize(
  group_by(steam_clean, platform_count),
  AveragePositiveRatio = mean(positive_ratio, na.rm = TRUE),
  Count = n()
)

# Interactive bar chart of ratings against number of supported platforms
layout(
  plot_ly(
    platform_ratings,
    x = ~as.factor(platform_count),
    y = ~AveragePositiveRatio,
    type = "bar",
    marker = list(
      color = plasma(nrow(platform_ratings), alpha = 0.8),
      line = list(color = "rgb(58,54,107)", width = 1.5)
    ),
    hoverinfo = "text",
    hovertext = ~paste("Platforms:", platform_count, 
                       "<br>Average Positive Ratio:", round(AveragePositiveRatio, 3),
                       "<br>Number of Games:", Count)
  ),
  title = list(text = "Average Positive Ratings Ratio by Platform Count", font = list(size = 16)),
  xaxis = list(title = "Number of Platforms"),
  yaxis = list(title = "Average Positive Ratings Ratio"),
  hoverlabel = list(bgcolor = "white")
)
# INSIGHTS:
# Games that support multiple platforms tend to have both higher average playtimes and higher positive
# ratings ratios. This may indicate that developers who invest in multi-platform support also invest more
# in overall game quality and content depth, or that the broader accessibility leads to more diverse
# player feedback.

6 Distribution Fitting

# Keep only ratios strictly between 0.01 and 0.99: this drops the exact 0s
# and 1s (and anything within 0.01 of them) before the distributions are fitted
fit_data <- with(
  steam_clean,
  positive_ratio[positive_ratio > 0.01 & positive_ratio < 0.99]
)

# Fit the three candidate distributions to the same data
fit_norm <- fitdist(fit_data, distr = "norm")
fit_beta <- fitdist(fit_data, distr = "beta")
fit_gamma <- fitdist(fit_data, distr = "gamma")

# Collect the AIC of each fitted distribution (lower is better) and show the
# candidates as a table sorted by AIC
aic_comparison <- data.frame(
  Distribution = c("Normal", "Beta", "Gamma"),
  AIC = c(fit_norm$aic, fit_beta$aic, fit_gamma$aic)
)

kable_styling(
  kable(arrange(aic_comparison, AIC)),
  bootstrap_options = c("striped", "hover")
)
Distribution AIC
Beta -15963.267
Normal -10130.781
Gamma -4588.201
# Report the distribution whose AIC equals the smallest AIC among the three
# fits, read from the comparison table built above
best_aic <- min(aic_comparison$AIC)
best_dist <- as.character(aic_comparison$Distribution)[aic_comparison$AIC == best_aic]
cat(sprintf("Best fitting distribution: %s\n", best_dist))
## Best fitting distribution: Beta
# Plot the distributions: the four fitdistrplus comparison plots (density,
# QQ, CDF, PP) on one 2 x 2 page with an overall title.
# par() returns the previous settings, which are restored at the end so later
# plots are not affected by this layout.
old_par <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 1), oma = c(0, 0, 2, 0))
plot.legend <- c("Normal", "Beta", "Gamma")
fit_list <- list(fit_norm, fit_beta, fit_gamma)

# Draw one comparison panel. If the plotting function fails, draw a
# placeholder panel that shows the actual error message, so the remaining
# panels are still produced and the cause of the failure is not hidden.
#
# plot_fun: one of the fitdistrplus comparison functions.
# label: short panel name used in the placeholder title.
draw_fit_panel <- function(plot_fun, label) {
  invisible(tryCatch(
    plot_fun(fit_list, legendtext = plot.legend),
    error = function(e) {
      plot(1, 1, type = "n", xlab = "", ylab = "",
           main = paste("Error in", label, "plot"))
      text(1, 1, conditionMessage(e))
    }
  ))
}

draw_fit_panel(denscomp, "density")
draw_fit_panel(qqcomp, "QQ")
draw_fit_panel(cdfcomp, "CDF")
draw_fit_panel(ppcomp, "PP")

# Add overall title (uses the outer margin reserved through oma above)
title("Comparison of Distribution Fits", outer = TRUE)

# Restore the graphics settings that were in effect before this block
par(old_par)

# INSIGHTS:
# The distribution fitting analysis reveals which statistical distribution best models the positive ratings
# ratio of Steam games. The AIC values indicate that the Beta distribution provides the best fit, which is
# expected for data bounded between 0 and 1. This suggests that ratings follow a predictable pattern that
# can be modeled for future game performance prediction.